R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load the raw Google Play Store export. read.csv() already returns a
# data.frame, so no additional data.frame() wrapper is required.
data <- read.csv("googleplaystore.csv")

Apps

library(ggplot2)

# Display all the duplicated Apps.
# Rows are grouped on every column other than App and the App entries in each
# group are counted, so a count above 1 marks a record that occurs repeatedly.
# NOTE(review): aggregate()'s formula interface defaults to
# na.action = na.omit, so rows with an NA in any column are presumably left
# out of this count -- confirm that is intended.
rows_per_group <- aggregate(App ~ ., data = data, FUN = length)
is_repeated <- rows_per_group$App > 1
duplicate_apps <- rows_per_group[is_repeated, ]
# Most frequently repeated records first
duplicate_apps <- duplicate_apps[order(-duplicate_apps$App), ]
# View(duplicate_apps)
# print(duplicate_apps)
n_duplicated <- nrow(duplicate_apps)
print(paste("Number of duplicated Apps:", n_duplicated))
## [1] "Number of duplicated Apps: 404"
# Remove rows without an app name and keep only the first row per app name
has_name <- !is.na(data$App)
first_occurrence <- !duplicated(data$App)
data_clean <- data[has_name & first_occurrence, ]

# Number of distinct app names left after removing the duplicates
unique_apps <- length(unique(data_clean$App))
print(paste("Number of unique apps after removing the duplicates:", unique_apps))
## [1] "Number of unique apps after removing the duplicates: 9660"

About 404 apps are repeated two or three times. After removing all duplicated app names, there are 9660 unique apps in the data frame (1181 rows removed).

Below is the data frame with the number of unique values and NA values for each variable in the dataset after removing the duplicates.

# Per-column overview of the de-duplicated data: the distinct values (as one
# comma-separated string), how many distinct values there are, and how many
# entries are NA.
unique_values_list <- lapply(data_clean, unique)
unique_counts_list <- lapply(unique_values_list, length)
null_counts_list <- lapply(data_clean, function(column) sum(is.na(column)))

collapse_values <- function(values) paste(values, collapse = ", ")

unique_df <- data.frame(
  Unique_Values = vapply(unique_values_list, collapse_values, character(1)),
  Unique_Counts = unlist(unique_counts_list),
  Null_Counts = unlist(null_counts_list)
)

Price

typeof(data_clean$Price)
## [1] "character"

Price must be converted to a numeric type.

# To check if there is a dollar symbol present
# data_clean$Price[]
# Strip every literal "$" and coerce the remaining text to numeric; text that
# is not a number becomes NA (as.numeric() warns when that happens).
price_text <- gsub("$", "", data_clean$Price, fixed = TRUE)
data_clean$Price <- as.numeric(price_text)
# Recheck for dollar symbol
# data_clean$Price[]
# Summary statistics for price
summary(data_clean$Price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   1.099   0.000 400.000       1

From the unique_df, there is a missing value present in the Price column.

#Checking for missing values in Price
# Price was converted with as.numeric() earlier, so text that could not be
# parsed as a number is already NA at this point.
missing_na <- is.na(data_clean$Price)    # TRUE where Price is NA
missing_blank <- data_clean$Price == "" # NOTE(review): compares a numeric column with ""; presumably never TRUE after the conversion -- confirm

sum(missing_na)
## [1] 1
sum(missing_blank, na.rm = TRUE)
## [1] 0
# Remove rows where Price is NA. Price is numeric at this point, so a blank
# string can no longer occur (blank text becomes NA during the numeric
# conversion); the former `!= ""` comparison never excluded any extra row.
data_clean <- data_clean[!is.na(data_clean$Price), ]

Removed one row (#10473): this app does not have a category name, so the row is not required.

#Recheck for missing values
summary(data_clean$Price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.099   0.000 400.000

Missing values removed successfully (Price).

#Checking the distribution of prices using histogram
library(ggplot2)

# Histogram of prices, zoomed to 0-500 on both axes.
# The zoom uses coord_cartesian() instead of xlim()/ylim(): scale limits
# discard any bar that extends outside the range before drawing, which
# silently removed the bar for free apps (thousands of rows, far above the
# y limit of 500). coord_cartesian() only clips the view, so that bar stays
# visible (cut off at the top) and the skew described below can be seen.
ggplot(data_clean, aes(x = Price)) +
  geom_histogram(binwidth = 2, fill = "pink", color = "black") +
  coord_cartesian(xlim = c(0, 500), ylim = c(0, 500)) +
  labs(title = "Price Distribution", x = "Price", y = "Frequency") +
  theme_minimal()

The data is highly skewed as there are many zero price entries.

# Boxplot for the same
ggplot(data_clean, aes(y=Price)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 1, fill="pink", color="black") +
  labs(title="Price Boxplot", y="Price") +
  theme_minimal()

#' Inspect (and optionally blank out) boxplot outliers of one column
#'
#' Draws the selected diagnostic plots twice -- first for the raw values
#' (top row, "with Outliers"), then with the outliers set to NA (bottom row,
#' "without Outliers") -- and prints summary figures about the outliers.
#' Outliers are the values reported by `boxplot.stats(x)$out`.
#'
#' @param df Data frame holding the column.
#' @param var Unquoted column name, evaluated inside `df`.
#' @param rm If `TRUE`, return a copy of `df` whose outliers in `var` are NA.
#' @param boxplt,histogram,qqplt Which plots to draw. The summary figures
#'   are printed only when at least one plot is requested.
#' @return Invisibly, `df` with outliers replaced by NA when `rm = TRUE`,
#'   otherwise the unmodified `df`.
outlierKD2 <- function(df, var, rm = FALSE, boxplt = FALSE, histogram = TRUE, qqplt = FALSE) {
  dt <- df  # Work on a copy so the caller's data frame is never altered
  var_name <- eval(substitute(var), eval(dt))
  na1 <- sum(is.na(var_name))  # NAs present before any outlier is blanked
  m1 <- mean(var_name, na.rm = TRUE)
  colTotal <- boxplt + histogram + qqplt  # Number of plot types requested

  # Two rows (before / after) with at least two columns; restore the caller's
  # graphics settings on exit so later plots are not forced into this grid.
  old_par <- par(mfrow = c(2, max(2, colTotal)), oma = c(0, 0, 3, 0))
  on.exit(par(old_par), add = TRUE)

  # Top row: raw data, outliers still included
  if (qqplt) {
    qqnorm(var_name, main = "Q-Q plot with Outliers")
    qqline(var_name)
  }
  if (histogram) {
    hist(var_name, main = "Histogram with Outliers", xlab = NA, ylab = NA)
  }
  if (boxplt) {
    boxplot(var_name, main = "Boxplot with Outliers")
  }

  # Identify the outliers and blank them out
  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier)
  var_name <- ifelse(var_name %in% outlier, NA, var_name)

  # Bottom row: same plots after the outliers were set to NA
  if (qqplt) {
    qqnorm(var_name, main = "Q-Q plot without Outliers")
    qqline(var_name)
  }
  if (histogram) {
    hist(var_name, main = "Histogram without Outliers", xlab = NA, ylab = NA)
  }
  if (boxplt) {
    boxplot(var_name, main = "Box Plot without Outliers")
  }

  # Overall title and summary figures, only when something was plotted
  if (colTotal > 0) {
    title("Outlier Check", outer = TRUE)
    na2 <- sum(is.na(var_name))  # original NAs plus the blanked outliers
    cat("Outliers identified:", na2 - na1, "\n")
    cat("Proportion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name)) * 100, 1), "\n")
    cat("Mean of the outliers:", round(mo, 2), "\n")
    cat("Mean without removing outliers:", round(m1, 2), "\n")
    cat("Mean if we remove outliers:", round(mean(var_name, na.rm = TRUE), 2), "\n")
  }

  # Hand back the cleaned copy only when removal was requested
  if (rm) {
    dt[[as.character(substitute(var))]] <- var_name
    cat("Outliers successfully removed", "\n")
    return(invisible(dt))
  }
  cat("Nothing changed", "\n")
  invisible(df)
}
# Outlier report for Price; rm = FALSE, so the data itself is left unchanged
outlier_check_price <- outlierKD2(data_clean, Price, rm = FALSE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 756 
## Proportion (%) of outliers: 8.5 
## Mean of the outliers: 14.05 
## Mean without removing outliers: 1.1 
## Mean if we remove outliers: 0 
## Nothing changed

The price values here are valid observations for our analysis(both typical and extreme values), so removing these outliers might not be useful.

#To check the value ranges
table(data_clean$Price)
## 
##      0   0.99      1   1.04    1.2   1.26   1.29   1.49    1.5   1.59   1.61 
##   8903    145      3      1      1      1      1     46      1      1      1 
##    1.7   1.75   1.76   1.96   1.97   1.99      2   2.49    2.5   2.56   2.59 
##      2      1      1      1      1     73      3     25      1      1      1 
##    2.6    2.9   2.95   2.99   3.02   3.04   3.08   3.28   3.49   3.61   3.88 
##      1      1      1    124      1      1      1      1      7      1      1 
##    3.9   3.95   3.99   4.29   4.49   4.59    4.6   4.77    4.8   4.84   4.85 
##      1      1     57      1      9      1      1      1      1      1      1 
##   4.99      5   5.49   5.99   6.49   6.99   7.49   7.99   8.49   8.99      9 
##     70      1      5     26      5     11      2      7      2      5      1 
##   9.99     10  10.99  11.99  12.99  13.99     14  14.99  15.46  15.99  16.99 
##     19      2      2      3      4      2      1      9      1      1      2 
##  17.99  18.99   19.4   19.9  19.99  24.99  25.99  28.99  29.99  30.99  33.99 
##      2      1      1      1      5      3      1      1      5      1      1 
##  37.99  39.99  46.99  74.99  79.99  89.99 109.99 154.99    200 299.99 379.99 
##      1      2      1      1      1      1      1      1      1      1      1 
## 389.99 394.99 399.99    400 
##      1      1     12      1

Type

table(data_clean$Type)
## 
## Free Paid 
## 8902  756
# Missing values. Type is a character column, so a missing entry may be
# stored as the literal string "NaN" rather than as NA; is.na() alone does
# not match that string, so both forms are counted.
type_missing <- is.na(data_clean$Type) | data_clean$Type == "NaN"
print(paste("Missing values:", sum(type_missing)))
## [1] "Missing values: 0"
data_clean[is.na(data_clean$Type), ]
##  [1] App            Category       Rating         Reviews        Size          
##  [6] Installs       Type           Price          Content.Rating Genres        
## [11] Last.Updated   Current.Ver    Android.Ver   
## <0 rows> (or 0-length row.names)

One row (9150) has a missing value for Type. As its price is 0, the value is replaced with “Free”.

# Replace NaN or missing values in the Type column with "Free".
# Type is a character column, so the missing entry may be stored as the
# literal string "NaN" rather than as NA. is.na() does not match that string,
# which left a spurious "NaN" level in the per-type summaries.
type_missing <- is.na(data_clean$Type) | data_clean$Type == "NaN"
data_clean$Type[type_missing] <- "Free"
ggplot(data_clean, aes(x = Type)) +
  geom_bar(fill = "pink", color = "black") +
  labs(title = "Distribution of App Types (Free vs Paid)", x = "Type", y = "Count") +
  theme_minimal()

data_clean$Type <- as.factor(data_clean$Type)

# Apply one summary function to Price within every Type level, ignoring NAs
price_stat_by_type <- function(stat) {
  tapply(data_clean$Price, data_clean$Type, stat, na.rm = TRUE)
}

# One row per Type level with the price range and central tendency
summary_by_type <- data.frame(
  Type = levels(data_clean$Type),
  Min_Price = price_stat_by_type(min),
  Max_Price = price_stat_by_type(max),
  Mean_Price = price_stat_by_type(mean),
  Median_Price = price_stat_by_type(median)
)

print(summary_by_type)
##      Type Min_Price Max_Price Mean_Price Median_Price
## Free Free      0.00         0    0.00000         0.00
## NaN   NaN      0.00         0    0.00000         0.00
## Paid Paid      0.99       400   14.04515         2.99
ggplot(data_clean, aes(x = Type, y = Price, fill = Type)) +
  geom_boxplot() +
  labs(title = "Price Distribution by App Type", 
       x = "App Type", 
       y = "Price ($)") +
  theme_minimal()

ggplot(data_clean, aes(x = Price, fill = Type)) +
  geom_histogram(binwidth = 60, alpha = 0.7, position = "identity") +
  facet_wrap(~ Type) +
  labs(title = "Price Distribution by App Type", 
       x = "Price ($)", 
       y = "Count") +
  theme_minimal()

Here, by analysing the price distribution by app type, we can see that some values in the Type column do not correctly reflect the price of the apps. Hence, as we can completely rely on the prices, the Type column is not required for our analysis.

Removing Type column…

# Drop the Type column by name, keeping every other column in place
keep_columns <- names(data_clean) != "Type"
data_clean <- data_clean[, keep_columns, drop = FALSE]
str(data_clean)
## 'data.frame':    9659 obs. of  12 variables:
##  $ App           : chr  "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite – FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
##  $ Category      : chr  "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : chr  "159" "967" "87510" "215644" ...
##  $ Size          : chr  "19M" "14M" "8.7M" "25M" ...
##  $ Installs      : chr  "10,000+" "500,000+" "5,000,000+" "50,000,000+" ...
##  $ Price         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Content.Rating: chr  "Everyone" "Everyone" "Everyone" "Teen" ...
##  $ Genres        : chr  "Art & Design" "Art & Design;Pretend Play" "Art & Design" "Art & Design" ...
##  $ Last.Updated  : chr  "January 7, 2018" "January 15, 2018" "August 1, 2018" "June 8, 2018" ...
##  $ Current.Ver   : chr  "1.0.0" "2.0.0" "1.2.4" "Varies with device" ...
##  $ Android.Ver   : chr  "4.0.3 and up" "4.0.3 and up" "4.0.3 and up" "4.2 and up" ...
head(data_clean)
##                                                  App       Category Rating
## 1     Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN    4.1
## 2                                Coloring book moana ART_AND_DESIGN    3.9
## 3 U Launcher Lite – FREE Live Cool Themes, Hide Apps ART_AND_DESIGN    4.7
## 4                              Sketch - Draw & Paint ART_AND_DESIGN    4.5
## 5              Pixel Draw - Number Art Coloring Book ART_AND_DESIGN    4.3
## 6                         Paper flowers instructions ART_AND_DESIGN    4.4
##   Reviews Size    Installs Price Content.Rating                    Genres
## 1     159  19M     10,000+     0       Everyone              Art & Design
## 2     967  14M    500,000+     0       Everyone Art & Design;Pretend Play
## 3   87510 8.7M  5,000,000+     0       Everyone              Art & Design
## 4  215644  25M 50,000,000+     0           Teen              Art & Design
## 5     967 2.8M    100,000+     0       Everyone   Art & Design;Creativity
## 6     167 5.6M     50,000+     0       Everyone              Art & Design
##       Last.Updated        Current.Ver  Android.Ver
## 1  January 7, 2018              1.0.0 4.0.3 and up
## 2 January 15, 2018              2.0.0 4.0.3 and up
## 3   August 1, 2018              1.2.4 4.0.3 and up
## 4     June 8, 2018 Varies with device   4.2 and up
## 5    June 20, 2018                1.1   4.4 and up
## 6   March 26, 2017                1.0   2.3 and up

The Type column is successfully removed.

head(data_clean)
##                                                  App       Category Rating
## 1     Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN    4.1
## 2                                Coloring book moana ART_AND_DESIGN    3.9
## 3 U Launcher Lite – FREE Live Cool Themes, Hide Apps ART_AND_DESIGN    4.7
## 4                              Sketch - Draw & Paint ART_AND_DESIGN    4.5
## 5              Pixel Draw - Number Art Coloring Book ART_AND_DESIGN    4.3
## 6                         Paper flowers instructions ART_AND_DESIGN    4.4
##   Reviews Size    Installs Price Content.Rating                    Genres
## 1     159  19M     10,000+     0       Everyone              Art & Design
## 2     967  14M    500,000+     0       Everyone Art & Design;Pretend Play
## 3   87510 8.7M  5,000,000+     0       Everyone              Art & Design
## 4  215644  25M 50,000,000+     0           Teen              Art & Design
## 5     967 2.8M    100,000+     0       Everyone   Art & Design;Creativity
## 6     167 5.6M     50,000+     0       Everyone              Art & Design
##       Last.Updated        Current.Ver  Android.Ver
## 1  January 7, 2018              1.0.0 4.0.3 and up
## 2 January 15, 2018              2.0.0 4.0.3 and up
## 3   August 1, 2018              1.2.4 4.0.3 and up
## 4     June 8, 2018 Varies with device   4.2 and up
## 5    June 20, 2018                1.1   4.4 and up
## 6   March 26, 2017                1.0   2.3 and up
tail(data_clean)
##                                                 App            Category Rating
## 10836                                      FR Forms            BUSINESS    NaN
## 10837                              Sya9a Maroc - FR              FAMILY    4.5
## 10838              Fr. Mike Schmitz Audio Teachings              FAMILY    5.0
## 10839                        Parkinson Exercices FR             MEDICAL    NaN
## 10840                 The SCP Foundation DB fr nn5n BOOKS_AND_REFERENCE    4.5
## 10841 iHoroscope - 2018 Daily Horoscope & Astrology           LIFESTYLE    4.5
##       Reviews               Size    Installs Price Content.Rating
## 10836       0               9.6M         10+     0       Everyone
## 10837      38                53M      5,000+     0       Everyone
## 10838       4               3.6M        100+     0       Everyone
## 10839       3               9.5M      1,000+     0       Everyone
## 10840     114 Varies with device      1,000+     0     Mature 17+
## 10841  398307                19M 10,000,000+     0       Everyone
##                  Genres       Last.Updated        Current.Ver
## 10836          Business September 29, 2016              1.1.5
## 10837         Education      July 25, 2017               1.48
## 10838         Education       July 6, 2018                1.0
## 10839           Medical   January 20, 2017                1.0
## 10840 Books & Reference   January 19, 2015 Varies with device
## 10841         Lifestyle      July 25, 2018 Varies with device
##              Android.Ver
## 10836         4.0 and up
## 10837         4.1 and up
## 10838         4.1 and up
## 10839         2.2 and up
## 10840 Varies with device
## 10841 Varies with device
str(data_clean)
## 'data.frame':    9659 obs. of  12 variables:
##  $ App           : chr  "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite – FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
##  $ Category      : chr  "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : chr  "159" "967" "87510" "215644" ...
##  $ Size          : chr  "19M" "14M" "8.7M" "25M" ...
##  $ Installs      : chr  "10,000+" "500,000+" "5,000,000+" "50,000,000+" ...
##  $ Price         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Content.Rating: chr  "Everyone" "Everyone" "Everyone" "Teen" ...
##  $ Genres        : chr  "Art & Design" "Art & Design;Pretend Play" "Art & Design" "Art & Design" ...
##  $ Last.Updated  : chr  "January 7, 2018" "January 15, 2018" "August 1, 2018" "June 8, 2018" ...
##  $ Current.Ver   : chr  "1.0.0" "2.0.0" "1.2.4" "Varies with device" ...
##  $ Android.Ver   : chr  "4.0.3 and up" "4.0.3 and up" "4.0.3 and up" "4.2 and up" ...

Now that the cleaning and analysis of Price and Apps is done, let's proceed with Ratings and Reviews.

Checking the format of Rating and Reviews

##  chr [1:9659] "159" "967" "87510" "215644" "967" "167" "178" "36815" ...
##  num [1:9659] 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...

As we can see, the Reviews column is stored as strings; it can be converted to a numeric type for more insights.

##Change the column reviews from Str to int

## 'data.frame':    9659 obs. of  12 variables:
##  $ App           : chr  "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite – FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
##  $ Category      : chr  "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : num  159 967 87510 215644 967 ...
##  $ Size          : chr  "19M" "14M" "8.7M" "25M" ...
##  $ Installs      : chr  "10,000+" "500,000+" "5,000,000+" "50,000,000+" ...
##  $ Price         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Content.Rating: chr  "Everyone" "Everyone" "Everyone" "Teen" ...
##  $ Genres        : chr  "Art & Design" "Art & Design;Pretend Play" "Art & Design" "Art & Design" ...
##  $ Last.Updated  : chr  "January 7, 2018" "January 15, 2018" "August 1, 2018" "June 8, 2018" ...
##  $ Current.Ver   : chr  "1.0.0" "2.0.0" "1.2.4" "Varies with device" ...
##  $ Android.Ver   : chr  "4.0.3 and up" "4.0.3 and up" "4.0.3 and up" "4.2 and up" ...
Table: Statistics summary.
App Category Rating Reviews Size Installs Price Content.Rating Genres Last.Updated Current.Ver Android.Ver
Min Length:9659 Length:9659 Min. :1.000 Min. : 0 Length:9659 Length:9659 Min. : 0.000 Length:9659 Length:9659 Length:9659 Length:9659 Length:9659
Q1 Class :character Class :character 1st Qu.:4.000 1st Qu.: 25 Class :character Class :character 1st Qu.: 0.000 Class :character Class :character Class :character Class :character Class :character
Median Mode :character Mode :character Median :4.300 Median : 967 Mode :character Mode :character Median : 0.000 Mode :character Mode :character Mode :character Mode :character Mode :character
Mean NA NA Mean :4.173 Mean : 216593 NA NA Mean : 1.099 NA NA NA NA NA
Q3 NA NA 3rd Qu.:4.500 3rd Qu.: 29401 NA NA 3rd Qu.: 0.000 NA NA NA NA NA
Max NA NA Max. :5.000 Max. :78158306 NA NA Max. :400.000 NA NA NA NA NA
NA NA NA NA’s :1463 NA NA NA NA NA NA NA NA NA

There are 1463 missing values in rating.

#Distribution of NA Ratings by Category.

# Rows whose Rating is missing
# NOTE(review): these are dplyr verbs, but the only visible library(dplyr)
# call appears later in the document -- confirm dplyr is attached here.
df_na_rating <- filter(data_clean, is.na(Rating))

# Number of missing ratings per Category, largest count first
na_rating_counts <- summarise(group_by(df_na_rating, Category), count = n())
na_rating_distribution <- arrange(na_rating_counts, desc(count))


# Horizontal bar chart of missing-rating counts per category; each bar
# carries its count as a white label centred inside the bar.
ggplot(na_rating_distribution, aes(x = reorder(Category, -count), y = count)) +
  geom_col(fill = "steelblue") +
  geom_text(
    aes(label = count),
    position = position_stack(vjust = 0.5),
    color = "white",
    size = 3
  ) +
  coord_flip() +
  labs(
    title = "Distribution of NA Ratings by Category",
    x = "Category",
    y = "Count of NA Ratings"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))

As can be observed, the Family category has the highest number of NA ratings.

#We can replace the NA values with the average instead of removing them to extract more information
library(dplyr)

# Method 1: fill every missing Rating with the mean of the observed ratings
overall_mean_rating <- mean(data_clean$Rating, na.rm = TRUE)
data_clean1 <- data_clean
data_clean1$Rating[is.na(data_clean1$Rating)] <- overall_mean_rating

xkablesummary(data_clean1)
Table: Statistics summary.
App Category Rating Reviews Size Installs Price Content.Rating Genres Last.Updated Current.Ver Android.Ver
Min Length:9659 Length:9659 Min. :1.000 Min. : 0 Length:9659 Length:9659 Min. : 0.000 Length:9659 Length:9659 Length:9659 Length:9659 Length:9659
Q1 Class :character Class :character 1st Qu.:4.000 1st Qu.: 25 Class :character Class :character 1st Qu.: 0.000 Class :character Class :character Class :character Class :character Class :character
Median Mode :character Mode :character Median :4.200 Median : 967 Mode :character Mode :character Median : 0.000 Mode :character Mode :character Mode :character Mode :character Mode :character
Mean NA NA Mean :4.173 Mean : 216593 NA NA Mean : 1.099 NA NA NA NA NA
Q3 NA NA 3rd Qu.:4.500 3rd Qu.: 29401 NA NA 3rd Qu.: 0.000 NA NA NA NA NA
Max NA NA Max. :5.000 Max. :78158306 NA NA Max. :400.000 NA NA NA NA NA

Now there are no missing values

##Checking for Outliers For rating by seeing frequency for each rating

 breaks <- seq(15, 20, by = 1)  # NOTE(review): appears unused before `breaks` is reassigned for the review bins -- confirm and remove
# Frequency of every distinct rating value
frequency_table <- table(data_clean1$Rating)
frequency_table
## 
##                1              1.2              1.4              1.5 
##               16                1                3                3 
##              1.6              1.7              1.8              1.9 
##                4                8                8               11 
##                2              2.1              2.2              2.3 
##               12                8               14               20 
##              2.4              2.5              2.6              2.7 
##               19               20               24               23 
##              2.8              2.9                3              3.1 
##               40               45               81               69 
##              3.2              3.3              3.4              3.5 
##               63              100              126              156 
##              3.6              3.7              3.8              3.9 
##              167              224              286              359 
##                4              4.1 4.17324304538799              4.2 
##              513              621             1463              810 
##              4.3              4.4              4.5              4.6 
##              897              895              848              683 
##              4.7              4.8              4.9                5 
##              442              221               85              271

From the table above, it can be seen that all ratings are between 1 and 5.

##Visualising the Rating Distribution

boxplot(data_clean1$Rating,ylab = "Rating", xlab = "Count",col = "Blue")

hist(data_clean1$Rating, main="Histogram of Apps Rating after cleaning", xlab="Rating (count)", col = 'blue', breaks = 100 )

# Normal Q-Q plot of the imputed ratings. The reference line must be computed
# from the same data as the points (data_clean1, not the pre-imputation
# data_clean), otherwise the line and the points describe different samples.
qqnorm(data_clean1$Rating)
qqline(data_clean1$Rating, col = "red")

Here, it can be seen that the plots are much clearer but still skewed due to the remaining outliers in the 1-3 rating range. As these may help explain why some apps are rated low, they cannot be removed from our dataset.

##Plotting for Reviews

boxplot(data_clean1$Reviews,ylab = "Reviews", xlab = "Count",col = 'Blue')

hist(data_clean1$Reviews, main="Histogram of Apps Reviews", xlab="Reviews (count)", col = 'blue', breaks = 100 )

# Histogram of log-transformed review counts. The labels now name the
# plotted variable (Reviews); they previously said "Rating".
# Note: log(0) is -Inf, so apps with zero reviews fall outside the histogram.
ggplot(data_clean1, aes(x = log(Reviews))) +
  geom_histogram(binwidth = 0.1, fill = "blue", color = "black") +
  labs(title = "Log-Transformed Histogram of Reviews", x = "Log(Reviews)", y = "Count")

qqnorm(data_clean1$Reviews)
qqline(data_clean1$Reviews, col = "red")

As with the ratings, the plots are skewed due to the outliers. Hence, we can use the log plot of reviews for the visualisation, which is a log-transformed version of Reviews. As the reviews are skewed, they do not follow a normal distribution.

##Review frequency table

xkablesummary(data_clean1)
Table: Statistics summary.
App Category Rating Reviews Size Installs Price Content.Rating Genres Last.Updated Current.Ver Android.Ver
Min Length:9659 Length:9659 Min. :1.000 Min. : 0 Length:9659 Length:9659 Min. : 0.000 Length:9659 Length:9659 Length:9659 Length:9659 Length:9659
Q1 Class :character Class :character 1st Qu.:4.000 1st Qu.: 25 Class :character Class :character 1st Qu.: 0.000 Class :character Class :character Class :character Class :character Class :character
Median Mode :character Mode :character Median :4.200 Median : 967 Mode :character Mode :character Median : 0.000 Mode :character Mode :character Mode :character Mode :character Mode :character
Mean NA NA Mean :4.173 Mean : 216593 NA NA Mean : 1.099 NA NA NA NA NA
Q3 NA NA 3rd Qu.:4.500 3rd Qu.: 29401 NA NA 3rd Qu.: 0.000 NA NA NA NA NA
Max NA NA Max. :5.000 Max. :78158306 NA NA Max. :400.000 NA NA NA NA NA
outlierKD2(data_clean1,Reviews)
## Outliers identified: 1656 
## Proportion (%) of outliers: 20.7 
## Mean of the outliers: 1228141 
## Mean without removing outliers: 216592.6 
## Mean if we remove outliers: 7280.61 
## Nothing changed

To check which values are outliers, let's split the data into bins and see which bins hold the most data; this helps us see how the reviews are distributed.

##Binned reviews

Binning the review counts into custom ranges to check the average rating for each bin.

# Define the new custom breaks for bins
# Ensure there are no NA values


# Bin edges for the review counts; with right = FALSE every bin is
# closed on the left: [lower, upper)
breaks <- c(0, 100, 500, 1000, 2500, 5000, 10000, 25000, 50000, 100000, 300000, 1000000, Inf)

# One label per bin, named after the bin's lower edge
review_labels <- c(
  "0+", "100+", "500+", "1K+",
  "2.5K+", "5K+", "10K+", "25K+",
  "50K+", "100K+", "300K+", "1M+"
)

# Assign every app to its review bin
Review_Category <- cut(
  data_clean1$Reviews,
  breaks = breaks,
  labels = review_labels,
  right = FALSE
)

# Number of apps per bin, as a data frame with columns
# Review_Category and Count
bin_counts <- as.data.frame(table(Review_Category), responseName = "Count")

# Print the counts
print(bin_counts)
##    Review_Category Count
## 1               0+  3327
## 2             100+  1065
## 3             500+   462
## 4              1K+   586
## 5            2.5K+   475
## 6              5K+   474
## 7             10K+   719
## 8             25K+   606
## 9             50K+   498
## 10           100K+   647
## 11           300K+   451
## 12             1M+   349
# Create a line plot of the binned counts
ggplot(bin_counts, aes(x = Review_Category, y = Count, group = 1)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "blue", size = 3) +
  labs(title = "Count of Reviews by Review Category", 
       x = "Review Category", 
       y = "Count of Reviews") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

Hence, high review counts are observed for fewer apps and low review counts for more apps, which is expected.

Rating vs Reviews boxplots

# Rating distribution within each review bin. The title now names the
# plotted quantity (ratings); it previously said "Review Counts".
boxplot(data_clean1$Rating ~ Review_Category, data = data_clean1,
        main = "Boxplot of Ratings by Review Category",
        xlab = "Review Category",
        ylab = "Review Rating",
        las = 2,        # Rotate the x-axis labels for readability
        col = "lightblue")  # Optional: Set color for the boxplots

Here we can observe that, as the number of reviews increases, the median rating increases and the values cluster around higher ratings, which suggests that a high review count could mean a better-rated app.

Mean value of Ratings for each Review bins

library(dplyr)

                                    

# Calculate the mean Rating for each Review_Category
mean_ratings <- tapply(data_clean1$Rating, Review_Category, mean, na.rm = TRUE)

# Convert the result to a data frame for better readability
mean_ratings_df <- data.frame(Review_Category = names(mean_ratings), Mean_Rating = as.numeric(mean_ratings))

# Print the mean ratings for each review bin
print(mean_ratings_df)
##    Review_Category Mean_Rating
## 1               0+    4.126221
## 2             100+    4.029538
## 3             500+    4.063188
## 4              1K+    4.107030
## 5            2.5K+    4.129572
## 6              5K+    4.191139
## 7             10K+    4.221836
## 8             25K+    4.231848
## 9             50K+    4.293775
## 10           100K+    4.329830
## 11           300K+    4.375610
## 12             1M+    4.426361
# Define correct order of Review_Category as a factor.
# The level order is taken from the binned Review_Category factor itself, so
# the plot order cannot drift from the bin labels if the bins are changed.
mean_ratings_df$Review_Category <- factor(mean_ratings_df$Review_Category,
                                          levels = levels(Review_Category))

# Plot the mean ratings for each review bin in the correct order
ggplot(mean_ratings_df, aes(x = Review_Category, y = Mean_Rating)) +
  geom_bar(stat = "identity", fill = "steelblue") +  # Use bar plot
  labs(title = "Mean Rating by Review Category",
       x = "Review Category",
       y = "Mean Rating") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

As we can see, the mean rating increases as the reviews increase.

##Histogram of Reviews and Rating

# Create a new data frame for plotting
plot_data <- data.frame(Rating = data_clean1$Rating, Review_Category = Review_Category)

# Create a histogram of Ratings, faceted by Review_Category
ggplot(plot_data, aes(x = Rating)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
  facet_wrap(~ Review_Category, labeller = label_wrap_gen()) +  # Facet by Review_Category
  theme_minimal() +
  labs(title = "Histograms of Ratings by Review Category", x = "Rating", y = "Frequency")

This is another representation of ratings vs reviews

ANOVA test

The tests below are to test whether or not different categories have different average ratings.

anova_result <- aov(Rating ~ as.factor(Review_Category), data = data_clean1)
summary(anova_result)
##                              Df Sum Sq Mean Sq F value Pr(>F)    
## as.factor(Review_Category)   11  106.3   9.662   41.36 <2e-16 ***
## Residuals                  9647 2253.6   0.234                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

According to p-value, it is significant hence we can say that the average rating for all review categories is not same

Post Hoc Test

# Perform Tukey's HSD
tukey_result <- TukeyHSD(anova_result)
tukey_result
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Rating ~ as.factor(Review_Category), data = data_clean1)
## 
## $`as.factor(Review_Category)`
##                     diff          lwr         upr     p adj
## 100+-0+     -0.096683215 -0.152307271 -0.04105916 0.0000009
## 500+-0+     -0.063032835 -0.141474646  0.01540898 0.2646281
## 1K+-0+      -0.019190832 -0.089971134  0.05158947 0.9992526
## 2.5K+-0+     0.003350463 -0.074143085  0.08084401 1.0000000
## 5K+-0+       0.064918154 -0.012646893  0.14248320 0.2087515
## 10K+-0+      0.095614797  0.030638525  0.16059107 0.0000973
## 25K+-0+      0.105627098  0.035846939  0.17540726 0.0000488
## 50K+-0+      0.167554014  0.091642554  0.24346547 0.0000000
## 100K+-0+     0.203608898  0.135724795  0.27149300 0.0000000
## 300K+-0+     0.249388670  0.170111342  0.32866600 0.0000000
## 1M+-0+       0.300139945  0.211244127  0.38903576 0.0000000
## 500+-100+    0.033650380 -0.054364565  0.12166533 0.9848292
## 1K+-100+     0.077492383 -0.003768703  0.15875347 0.0784345
## 2.5K+-100+   0.100033678  0.012862795  0.18720456 0.0096675
## 5K+-100+     0.161601369  0.074366918  0.24883582 0.0000001
## 10K+-100+    0.192298012  0.116039053  0.26855697 0.0000000
## 25K+-100+    0.202310313  0.121918874  0.28270175 0.0000000
## 50K+-100+    0.264237229  0.178469737  0.35000472 0.0000000
## 100K+-100+   0.300292113  0.221540831  0.37904339 0.0000000
## 300K+-100+   0.346071885  0.257311491  0.43483228 0.0000000
## 1M+-100+     0.396823160  0.299375844  0.49427048 0.0000000
## 1K+-500+     0.043842003 -0.054455739  0.14213974 0.9515761
## 2.5K+-500+   0.066383298 -0.036853541  0.16962014 0.6214468
## 5K+-500+     0.127950989  0.024660470  0.23124151 0.0030189
## 10K+-500+    0.158647632  0.064443010  0.25285225 0.0000025
## 25K+-500+    0.168659933  0.071079887  0.26623998 0.0000011
## 50K+-500+    0.230586849  0.128532233  0.33264146 0.0000000
## 100K+-500+   0.266641733  0.170408442  0.36287502 0.0000000
## 300K+-500+   0.312421505  0.207839051  0.41700396 0.0000000
## 1M+-500+     0.363172780  0.251123410  0.47522215 0.0000000
## 2.5K+-1K+    0.022541295 -0.075001405  0.12008400 0.9998394
## 5K+-1K+      0.084108986 -0.013490527  0.18170850 0.1727899
## 10K+-1K+     0.114805629  0.026878134  0.20273312 0.0012014
## 25K+-1K+     0.124817930  0.033283243  0.21635262 0.0005180
## 50K+-1K+     0.186744846  0.090454254  0.28303544 0.0000000
## 100K+-1K+    0.222799730  0.132702117  0.31289734 0.0000000
## 300K+-1K+    0.268579502  0.169613735  0.36754527 0.0000000
## 1M+-1K+      0.319330777  0.212504774  0.42615678 0.0000000
## 5K+-2.5K+    0.061567691 -0.041004546  0.16413993 0.7193424
## 10K+-2.5K+   0.092264334 -0.001152170  0.18568084 0.0565429
## 25K+-2.5K+   0.102276635  0.005457227  0.19909604 0.0276896
## 50K+-2.5K+   0.164203551  0.062875978  0.26553112 0.0000078
## 100K+-2.5K+  0.200258435  0.104796512  0.29572036 0.0000000
## 300K+-2.5K+  0.246038206  0.142165102  0.34991131 0.0000000
## 1M+-2.5K+    0.296789482  0.185401898  0.40817707 0.0000000
## 10K+-5K+     0.030696643 -0.062779181  0.12417247 0.9957463
## 25K+-5K+     0.040708944 -0.056167701  0.13758559 0.9685508
## 50K+-5K+     0.102635860  0.001253596  0.20401812 0.0440982
## 100K+-5K+    0.138690744  0.043170771  0.23421072 0.0001331
## 300K+-5K+    0.184470516  0.080544059  0.28839697 0.0000004
## 1M+-5K+      0.235221791  0.123784453  0.34665913 0.0000000
## 25K+-10K+    0.010012302 -0.077112114  0.09713672 0.9999999
## 50K+-10K+    0.071939217 -0.020169104  0.16404754 0.3070668
## 100K+-10K+   0.107994101  0.022380758  0.19360745 0.0022235
## 300K+-10K+   0.153773873  0.058872409  0.24867534 0.0000078
## 1M+-10K+     0.204525148  0.101453039  0.30759726 0.0000000
## 50K+-25K+    0.061926916 -0.033630908  0.15748474 0.6094814
## 100K+-25K+   0.097981800  0.008667751  0.18729585 0.0175649
## 300K+-25K+   0.143761571  0.045508620  0.24201452 0.0001113
## 1M+-25K+     0.194512847  0.088346871  0.30067882 0.0000001
## 100K+-50K+   0.036054884 -0.058127272  0.13023704 0.9846717
## 300K+-50K+   0.081834656 -0.020863551  0.18453286 0.2768896
## 1M+-50K+     0.132585931  0.022293168  0.24287869 0.0048805
## 300K+-100K+  0.045779772 -0.051135776  0.14269532 0.9282456
## 1M+-100K+    0.096531047 -0.008398431  0.20146052 0.1064662
## 1M+-300K+    0.050751275 -0.061884591  0.16338714 0.9479902
# Pull the pairwise-comparison table out of the Tukey result as a data frame
tukey_df <- as.data.frame(tukey_result[["as.factor(Review_Category)"]])

# Keep only the comparisons whose 4th column (the adjusted p-value) is < 0.05
significant_tukey <- tukey_df[tukey_df[[4]] < 0.05, ]

# Show the significant comparisons
print(significant_tukey)
##                    diff          lwr         upr        p adj
## 100+-0+     -0.09668322 -0.152307271 -0.04105916 8.987756e-07
## 10K+-0+      0.09561480  0.030638525  0.16059107 9.732720e-05
## 25K+-0+      0.10562710  0.035846939  0.17540726 4.884843e-05
## 50K+-0+      0.16755401  0.091642554  0.24346547 0.000000e+00
## 100K+-0+     0.20360890  0.135724795  0.27149300 0.000000e+00
## 300K+-0+     0.24938867  0.170111342  0.32866600 0.000000e+00
## 1M+-0+       0.30013994  0.211244127  0.38903576 0.000000e+00
## 2.5K+-100+   0.10003368  0.012862795  0.18720456 9.667490e-03
## 5K+-100+     0.16160137  0.074366918  0.24883582 9.538328e-08
## 10K+-100+    0.19229801  0.116039053  0.26855697 0.000000e+00
## 25K+-100+    0.20231031  0.121918874  0.28270175 0.000000e+00
## 50K+-100+    0.26423723  0.178469737  0.35000472 0.000000e+00
## 100K+-100+   0.30029211  0.221540831  0.37904339 0.000000e+00
## 300K+-100+   0.34607188  0.257311491  0.43483228 0.000000e+00
## 1M+-100+     0.39682316  0.299375844  0.49427048 0.000000e+00
## 5K+-500+     0.12795099  0.024660470  0.23124151 3.018884e-03
## 10K+-500+    0.15864763  0.064443010  0.25285225 2.473396e-06
## 25K+-500+    0.16865993  0.071079887  0.26623998 1.080775e-06
## 50K+-500+    0.23058685  0.128532233  0.33264146 0.000000e+00
## 100K+-500+   0.26664173  0.170408442  0.36287502 0.000000e+00
## 300K+-500+   0.31242150  0.207839051  0.41700396 0.000000e+00
## 1M+-500+     0.36317278  0.251123410  0.47522215 0.000000e+00
## 10K+-1K+     0.11480563  0.026878134  0.20273312 1.201416e-03
## 25K+-1K+     0.12481793  0.033283243  0.21635262 5.179950e-04
## 50K+-1K+     0.18674485  0.090454254  0.28303544 1.572425e-08
## 100K+-1K+    0.22279973  0.132702117  0.31289734 0.000000e+00
## 300K+-1K+    0.26857950  0.169613735  0.36754527 0.000000e+00
## 1M+-1K+      0.31933078  0.212504774  0.42615678 0.000000e+00
## 25K+-2.5K+   0.10227664  0.005457227  0.19909604 2.768961e-02
## 50K+-2.5K+   0.16420355  0.062875978  0.26553112 7.808701e-06
## 100K+-2.5K+  0.20025843  0.104796512  0.29572036 3.507883e-10
## 300K+-2.5K+  0.24603821  0.142165102  0.34991131 0.000000e+00
## 1M+-2.5K+    0.29678948  0.185401898  0.40817707 0.000000e+00
## 50K+-5K+     0.10263586  0.001253596  0.20401812 4.409823e-02
## 100K+-5K+    0.13869074  0.043170771  0.23421072 1.331239e-04
## 300K+-5K+    0.18447052  0.080544059  0.28839697 4.428778e-07
## 1M+-5K+      0.23522179  0.123784453  0.34665913 2.244942e-10
## 100K+-10K+   0.10799410  0.022380758  0.19360745 2.223466e-03
## 300K+-10K+   0.15377387  0.058872409  0.24867534 7.832139e-06
## 1M+-10K+     0.20452515  0.101453039  0.30759726 5.942656e-09
## 100K+-25K+   0.09798180  0.008667751  0.18729585 1.756493e-02
## 300K+-25K+   0.14376157  0.045508620  0.24201452 1.113055e-04
## 1M+-25K+     0.19451285  0.088346871  0.30067882 1.436204e-07
## 1M+-50K+     0.13258593  0.022293168  0.24287869 4.880458e-03

As we can see, 44 of the 66 pairwise comparisons are significant at the 0.05 level. The differences in average rating are most pronounced between the lowest review categories (0+, 100+) and 1M+, as expected.

## To make the Ratings and Reviews vs. Installs comparisons easier, we can group Installs into the given categories

# Load necessary libraries
library(ggplot2)

# Collect the distinct labels that occur in the Installs column
unique_values <- unique(data_clean1[["Installs"]])

# Convert install labels such as "10,000+" to their numeric value.
#
# x: character vector of install labels (any length).
# Returns a numeric vector the same length as x; a label that contains no
# digits becomes NA.
convert_to_numeric <- function(x) {
  # Dropping every non-digit character ("," and "+") already leaves the full
  # number, so no extra scaling by the number of commas is applied.
  as.numeric(gsub("[^0-9]", "", x))
}

# Put the install labels in ascending order of their numeric value
sorted_values <- unique_values[
  order(vapply(unique_values, convert_to_numeric, numeric(1)))
]

# Bar chart of app counts per install label; the factor is built inside aes()
# so no column is added to data_clean1
ggplot(data = data_clean1, aes(x = factor(Installs, levels = sorted_values))) +
  geom_bar(fill = "blue", alpha = 0.7) +
  labs(x = "Installs", y = "Count", title = "Distribution of App Installs") +
  # Rotate x-axis labels for readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Ratings vs Installs

Now we can check what is the average rating for each Install category and what is the relationship between them

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Convert install labels such as "10,000+" to their numeric value.
#
# x: character vector of install labels (any length).
# Returns a numeric vector the same length as x; a label that contains no
# digits becomes NA.
convert_to_numeric <- function(x) {
  # Dropping every non-digit character ("," and "+") already leaves the full
  # number, so no extra scaling by the number of commas is applied.
  as.numeric(gsub("[^0-9]", "", x))
}

# Per install category: mean rating (NA ratings ignored) and number of rows
data_mean <- data_clean1 %>%
  group_by(Installs) %>%
  summarise(
    Mean_Rating = mean(Rating, na.rm = TRUE),
    Count = n()
  ) %>%
  ungroup()

# Install categories in ascending numeric order, used as the x-axis levels
sorted_installs <- data_mean$Installs[
  order(vapply(data_mean$Installs, convert_to_numeric, numeric(1)))
]

# Dot plot: one dot per install category at its mean rating, sized by Count,
# with a dashed stem from 0 up to the dot
ggplot(
  data_mean,
  aes(x = factor(Installs, levels = sorted_installs), y = Mean_Rating)
) +
  geom_point(aes(size = Count), color = "blue", alpha = 0.7) +
  # x is inherited from the plot-level mapping; only the stem's end points
  # are specified here
  geom_segment(
    aes(
      xend = factor(Installs, levels = sorted_installs),
      y = 0,
      yend = Mean_Rating
    ),
    color = "grey",
    linetype = "dashed"
  ) +
  scale_size_continuous(name = "Number of Ratings") +  # legend title for dot size
  labs(title = "Mean Ratings by Install Category", x = "Install Categories", y = "Mean Ratings") +
  theme_minimal() +
  # Rotate x-axis labels for readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The analysis reveals that both low and high install counts correspond to high ratings. However, apps with a greater number of installs and high ratings are generally regarded as superior, as indicated by the size of the dots in the plot, which reflects the volume of ratings they have received.

Category

# Number of distinct values in the Category column
length(unique(data_clean[["Category"]]))
## [1] 33
# Number of distinct values in the Genres column
length(unique(data_clean[["Genres"]]))
## [1] 118

There are 33 categories in the dataframe with 118 genres. This means that each category contains multiple genres. Given that, the later analyses in this project proceed with the Category variable.

Below is the graph for the distribution of Categories for the dataset after removing duplicates.

# Frequency of each Category value
category_counts <- table(data_clean[["Category"]])

# Same counts as a two-column data frame (Category, Frequency) for plotting
category_counts_df <- setNames(
  as.data.frame(category_counts),
  c("Category", "Frequency")
)

# Horizontal bar chart, categories ordered by frequency, count printed on
# each bar
ggplot(
  category_counts_df,
  aes(x = reorder(Category, Frequency), y = Frequency)
) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_text(aes(label = Frequency), vjust = 0.5, hjust = 1, size = 2.5) +
  coord_flip() +
  labs(title = "Distribution of Categories", x = "Category", y = "Frequency") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 5.5))

## Category vs. Installs

library(DT)
# Turn install labels such as "10,000+" into numbers.
#
# Installs: vector of install labels.
# Returns a numeric vector of the same length; as.numeric() yields NA (with a
# warning) for anything that is still not a number once "+" and "," are gone.
clean_installs <- function(Installs) {
  # Strip every '+' sign and every comma in one pass, then convert
  as.numeric(gsub("[+,]", "", Installs))
}

# Convert the install labels to numbers with one vectorised call.
# clean_installs() is built from gsub() and as.numeric(), which both operate
# on whole vectors, so no per-element sapply() is needed; this also avoids the
# element names sapply() attaches when iterating over character input.
data_clean$Installs <- clean_installs(data_clean$Installs)

# Row mask: TRUE for every row where Size or Installs is NaN.
# is.nan() is applied to each of the two columns and the per-column results
# are OR-ed together, giving one flag per row.
# NOTE(review): a failed as.numeric() conversion produces NA, which is.nan()
# does not flag -- switch to is.na() if those rows should be listed too.
nan_rows <- Reduce(`|`, lapply(data_clean[, c("Size", "Installs")], is.nan))

# Display only rows that contain NaN in either Size or Installs
data_clean[nan_rows, ]
## data frame with 0 columns and 9659 rows
# Interactive table of the cleaned data; scrollX enables horizontal scrolling
datatable(data_clean, options = list(scrollX = TRUE))
# Collect the distinct labels that occur in the Installs column
unique_values <- unique(data_clean1[["Installs"]])

# Show them
print(unique_values)
##  [1] "10,000+"        "500,000+"       "5,000,000+"     "50,000,000+"   
##  [5] "100,000+"       "50,000+"        "1,000,000+"     "10,000,000+"   
##  [9] "5,000+"         "100,000,000+"   "1,000,000,000+" "1,000+"        
## [13] "500,000,000+"   "50+"            "100+"           "500+"          
## [17] "10+"            "1+"             "5+"             "0+"            
## [21] "0"
# Convert install labels such as "10,000+" to their numeric value.
#
# x: character vector of install labels (any length).
# Returns a numeric vector the same length as x; a label that contains no
# digits becomes NA.
convert_to_numeric <- function(x) {
  # Dropping every non-digit character ("," and "+") already leaves the full
  # number, so no extra scaling by the number of commas is applied.
  as.numeric(gsub("[^0-9]", "", x))
}

# Put the install labels in ascending order of their numeric value
sorted_values <- unique_values[
  order(vapply(unique_values, convert_to_numeric, numeric(1)))
]

# Work on a copy so data_clean1 keeps its original Installs column; in the
# copy, Installs becomes a factor whose levels follow sorted_values
data_clean1_factor <- data_clean1
data_clean1_factor$Installs <- factor(
  data_clean1$Installs,
  levels = sorted_values
)

# Bar chart of app counts per install level
ggplot(data_clean1_factor, aes(x = Installs)) +
  geom_bar() +
  labs(x = "Installs", y = "Count", title = "Distribution of App Installs") +
  # Rotate x-axis labels for readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Reviews on the x-axis against the ordered Installs factor on the y-axis
ggplot(data = data_clean1_factor, mapping = aes(x = Reviews, y = Installs)) +
  geom_point(color = "blue", alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Scatter Plot of Installs vs Reviews",
    x = "Number of Reviews",
    y = "Number of Installs"
  )

# Natural log of the numeric Installs column, stored as a new column.
# NOTE(review): log(0) is -Inf, so any row with 0 installs gets -Inf here --
# confirm whether such rows should be filtered or a log(x + 1) scale used.
data_clean[["log_Installs"]] <- log(data_clean[["Installs"]])

# Rating against log(Installs), with a straight-line (lm) fit and no
# confidence band
ggplot(data = data_clean, mapping = aes(x = log_Installs, y = Rating)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  theme_minimal() +
  labs(
    title = "Log-Transformed Installs vs. Rating",
    x = "Log(Installs)",
    y = "Rating"
  )